Main analyses
Below is the code to reproduce all figures in the main text of the manuscript.
Figure 2: Action patterns over time
We dailyrize the data.
df_seq <- dailyrize(df_act_clean, habits_info) %>%
arrange(sub, habit_name)
Get action sequences of 20 random users.
set.seed(1)
users <- sample(unique(df_act_clean$sub), size = 20)
psequences <- plot_habit_seq_many(df_seq, users)
We calculate segment size, average gap, and dice similarity.
# Now aggregate over weeks
df_weekly <- df_seq %>%
mutate(
year = year(day),
week = week(day)
) %>%
group_by(sub, habit_name, year, week) %>%
summarize(week_count = sum(value))
df_prep <- df_seq %>%
group_by(sub, habit_name) %>%
mutate(time = seq(n()))
df_wide <- df_prep %>%
pivot_wider(names_from = habit_name, values_from = value, values_fill = 0) %>%
dplyr::select(-day)
habit_names <- colnames(df_wide)[-seq(2)]
colnames(df_wide) <- c('sub', 'time', paste0('habit', seq(length(habit_names))))
process_group <- function(data, filter_d = TRUE) {
data <- data %>%
mutate(all_habits_zero = rowSums(select(., starts_with('habit'))) == 0) %>%
filter(!all_habits_zero) %>%
mutate(
prev_diff = time - lag(time, default = time[1]),
next_diff = lead(time, default = time[length(time)]) - time
)
if (filter_d) {
data <- data %>%
filter(prev_diff == 1 | next_diff == 1) %>%
select(-prev_diff, -next_diff, -all_habits_zero)
}
data
}
# Remove rows with all 0 entries
df_filtered <- df_wide %>%
group_by(sub) %>%
# Do not only take consecutive observations (as this removes users that don't have those)
group_map(~ process_group(.x, filter_d = FALSE), .keep = TRUE) %>%
bind_rows()
df_consistency <- df_filtered %>%
group_by(sub) %>%
arrange(sub, time) %>%
mutate(across(starts_with('habit'),
~ ifelse(is.na(lag(.)), NA, . == lag(.)),
.names = 'match_{col}'))
# Identify continuous segments on all data
df_segment <- df_wide %>%
group_by(sub) %>%
group_map(~ process_group(.x, filter_d = FALSE), .keep = TRUE) %>%
bind_rows() %>%
mutate(segment = cumsum(lag(time, default = first(time)) - time != -1)) %>%
ungroup() %>%
select(sub, time, segment, everything())
df_avg_segment <- df_segment %>%
group_by(sub, segment) %>%
summarize(segment_n = n()) %>%
group_by(sub) %>%
summarize(
segment_mean = mean(segment_n),
segment_median = median(segment_n),
segment_max = max(segment_n)
) %>%
arrange(desc(segment_mean))
df_avg_gap <- df_segment %>%
group_by(sub) %>%
arrange(time) %>%
# Difference in time between segments
mutate(segment_time_diff = time - lag(time, default = first(time))) %>%
# Only get the first difference
group_by(sub, segment) %>%
summarise(start_time_diff = first(segment_time_diff)) %>%
# Average time differences
group_by(sub) %>%
summarise(
time_diff_mean = mean(start_time_diff, na.rm = TRUE),
time_diff_median = median(start_time_diff, na.rm = TRUE),
time_diff_max = max(start_time_diff, na.rm = TRUE)
) %>%
ungroup()
df_avg_seggap <- df_avg_segment %>%
left_join(df_avg_gap, by = 'sub')
dice_similarity <- function(row1, row2) {
intersection <- sum(row1 & row2)
total <- sum(row1) + sum(row2)
if (total == 0) return(NA)
return(2 * intersection / total)
}
get_similarity_df <- function(df_consistency, similarity_fn) {
uniq_subs <- unique(df_consistency$sub)
df_sim <- data.frame(sub = NA, similarity = NA)
for (sub in uniq_subs) {
df_sel <- df_consistency %>%
filter(sub == !!sub) %>%
ungroup() %>%
select(habit_cols)
sims <- c()
for (i in seq(2, nrow(df_sel))) {
similarity <- similarity_fn(as.numeric(df_sel[i, ]), as.numeric(df_sel[i - 1, ]))
sims <- c(sims, similarity)
}
df_sim <- rbind(df_sim, c(sub, mean(sims, na.rm = TRUE)))
}
df_sim <- df_sim[-1, ]
df_sim$similarity <- as.numeric(df_sim$similarity)
df_sim
}
habit_cols <- grep('^habit', names(df_consistency), value = TRUE)
df_dice <- get_similarity_df(df_consistency, dice_similarity)
pconsistency <- ggplot(df_dice, aes(x = similarity)) +
geom_histogram(aes(y = after_stat(count / sum(count)))) +
scale_x_continuous(breaks = seq(0, 1, 0.10), limits = c(0.00, 1.0)) +
theme_minimal() +
ylab('') +
xlab('Mean Dice similarity') +
ggtitle('Mean similarity of action reports') +
theme(
plot.title = element_text(size = 14, hjust = 0.50), legend.position = 'none'
)
psegment <- ggplot(df_avg_seggap, aes(x = segment_mean)) +
geom_histogram(aes(y = after_stat(count / sum(count)))) +
scale_x_continuous(trans = 'log10', breaks = c(1, 2, 3, 4, 5, 10, 20, 40, 100)) +
theme_minimal() +
ylab('') +
xlab('Mean segment size') +
ggtitle('Mean size of continuous segment') +
theme(
plot.title = element_text(size = 14, hjust = 0.50), legend.position = 'none'
)
pgap <- ggplot(df_avg_seggap, aes(x = time_diff_mean)) +
geom_histogram(aes(y = after_stat(count / sum(count)))) +
theme_minimal() +
scale_x_continuous(breaks = seq(0, 160, 20)) +
ylab('Proportion') +
xlab('Mean gap size') +
ggtitle('Mean size of gap between segments') +
theme(
plot.title = element_text(size = 14, hjust = 0.50), legend.position = 'none'
)
c(mean(df_dice$similarity), sd(df_dice$similarity))
## [1] 0.3980062 0.2251452
c(mean(df_avg_seggap$segment_mean), sd(df_avg_seggap$segment_mean))
## [1] 2.100084 4.252741
c(mean(df_avg_seggap$time_diff_mean), sd(df_avg_seggap$time_diff_mean))
## [1] 14.83381 20.16450
We combine these three figures.
pdf('figures/Figure2.pdf', width = 9, height = 9)
grid.arrange(
psequences, psegment, pgap, pconsistency, ncol = 2,
layout_matrix = matrix(c(1, 1, 1, 2, 3, 4), nrow = 3, ncol = 2)
)
dev.off()
## quartz_off_screen
## 2
grid.arrange(
psequences, psegment, pgap, pconsistency, ncol = 2,
layout_matrix = matrix(c(1, 1, 1, 2, 3, 4), nrow = 3, ncol = 2)
)
Figure 3 + Figure S3: Most frequent actions
We compute the mean number of actions per week for the different categories.
# Combine weekly data with country data
df_country_week <- df_action_weekly %>%
filter(sub %in% df_act_clean$sub) %>%
left_join(activity_table_ext %>% select(sub, country), by = c('sub')) %>%
filter(!is.na(habit_name)) # remove the right-padded weeks
df_weeks_user_active <- df_country_week %>%
group_by(sub) %>%
summarize(nr_weeks_active = length(unique(week_start)))
df_freq_avg <- df_country_week %>%
# Group by person and action, count number of actions
group_by(sub, habit_name) %>%
summarize(nr_act = n()) %>%
left_join(df_weeks_user_active, by = 'sub') %>%
# Calculate mean number of action per week
mutate(n_per_week = nr_act / nr_weeks_active) %>%
left_join(df_act_clean %>% select(sub, country) %>% distinct(.keep_all = TRUE), by = 'sub') %>%
left_join(df_country_week %>% select(habit_name, category) %>% distinct(.keep_all = TRUE), by = 'habit_name')
top10_actions_week_avg_category <- df_freq_avg %>%
group_by(habit_name) %>%
summarize(
n_per_week_avg = mean(n_per_week),
sd_per_week_avg = sd(n_per_week),
se_per_week_avg = sd(n_per_week) / sqrt(n())
) %>%
mutate(country = 'All') %>%
left_join(
df_freq_avg %>% ungroup() %>% select(habit_name, category) %>% distinct(habit_name, category),
by = 'habit_name'
) %>%
arrange(category, desc(n_per_week_avg)) %>%
filter(category %in% c('Energy', 'Mobility', 'Food', 'Purchase')) %>%
group_by(category) %>%
slice_head(n = 10)
df_freq_country_avg_category <- df_freq_avg %>%
filter(
country %in% c('IT', 'US', 'MX'),
habit_name %in% top10_actions_week_avg_category$habit_name
) %>%
group_by(habit_name, country) %>%
summarize(
n_per_week_avg = mean(n_per_week),
sd_per_week_avg = sd(n_per_week),
se_per_week_avg = sd(n_per_week) / sqrt(n())
) %>%
mutate(
country = case_when(
country == 'MX' ~ 'Mexico',
country == 'IT' ~ 'Italy',
country == 'US' ~ 'USA'
)
) %>%
left_join(
df_freq_avg %>% ungroup() %>% select(habit_name, category) %>% distinct(habit_name, category),
by = 'habit_name'
) %>%
select(colnames(top10_actions_week_avg_category))
# Useful later to distinguish the three top countries in terms of users
df_freq_country_avg_category <- bind_rows(df_freq_country_avg_category, top10_actions_week_avg_category)
df_freq_country_avg_category$habit_name <- factor(
df_freq_country_avg_category$habit_name, levels = rev(top10_actions_week_avg_category$habit_name)
)
df_freq_country_avg_category$category <- factor(
df_freq_country_avg_category$category, levels = c('Energy', 'Food', 'Mobility', 'Purchase')
)
df_freq_country_avg_category$country <- factor(
df_freq_country_avg_category$country, levels = rev(c('Italy', 'USA', 'Mexico', 'All'))
)
cols <- c('gray56', '#D95F02', '#7570B3', '#1B9E77')
pwidth <- 0.90
p_country_cat <- ggplot(
df_freq_country_avg_category %>% filter(country == 'All'),
aes(x = n_per_week_avg, y = habit_name, fill = country)
) +
geom_bar(stat = 'identity', position = position_dodge(width = pwidth), width = 0.75) +
geom_point(
aes(x = n_per_week_avg, y = habit_name), position = position_dodge(width = pwidth),
size = 2, show.legend = FALSE, color = 'black'
) +
geom_errorbar(
aes(xmin = n_per_week_avg - 1.96 * se_per_week_avg, xmax = n_per_week_avg + 1.96 * se_per_week_avg),
position = position_dodge(width = pwidth),
width = 0.40, linewidth = 1, show.legend = FALSE, color = 'black'
) +
geom_point(
aes(x = n_per_week_avg, y = habit_name, col = country), position = position_dodge(width = pwidth),
size = 1, show.legend = FALSE
) +
geom_errorbar(
aes(
xmin = n_per_week_avg - 1.96 * se_per_week_avg,
xmax = n_per_week_avg + 1.96 * se_per_week_avg,
col = country
),
position = position_dodge(width = pwidth),
width = 0.30, linewidth = 0.30,
show.legend = FALSE
) +
ylab('') +
xlab('Mean number of actions per week') +
scale_color_manual(values = cols) +
scale_fill_manual(
values = cols, labels = rev(c('Italy', 'USA', 'Mexico', 'All')),
guide = guide_legend(reverse = TRUE)
) +
scale_x_continuous(breaks = seq(0, 2, 0.50), limits = c(0, 2.2)) +
facet_wrap(~ category, scales = 'free_y', nrow = 2) +
theme_minimal() +
theme(
legend.title = element_blank(),
legend.position = 'none',
strip.text = element_text(size = 12),
plot.title = element_text(hjust = 0.50, size = 16),
axis.text.y = element_text(margin = margin(r = -5), size = 8),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(-2, 'lines')
) + guides(fill = guide_legend(reverse = TRUE), color = FALSE)
ggsave('figures/Figure3.pdf', p_country_cat, width = 10, height = 10)
p_country_cat
Below, we calculate the mean number of actions across categories for the different countries. This is Supplementary figure 2.
df_freq_avg_cat <- df_country_week %>%
# Group by person and action, count number of actions
group_by(sub, category) %>%
summarize(nr_act = n()) %>%
left_join(df_weeks_user_active, by = 'sub') %>%
# Calculate mean number of action per week
mutate(n_per_week = nr_act / nr_weeks_active) %>%
left_join(df_act_clean %>% select(sub, country) %>% distinct(.keep_all = TRUE), by = 'sub') %>%
filter(
category %in% c('Energy', 'Food', 'Mobility', 'Purchase'),
country %in% c('IT', 'US', 'MX')
) %>%
group_by(country, category) %>%
summarize(
n_per_week_avg = mean(n_per_week),
sd_per_week_avg = sd(n_per_week),
se_per_week_avg = sd(n_per_week) / sqrt(n())
) %>%
mutate(
category = factor(category, levels = rev(c('Energy', 'Food', 'Mobility', 'Purchase'))),
country = factor(country, levels = rev(c('IT', 'MX', 'US')))
)
cols <- c('grey56', '#D95F02', '#7570B3', '#1B9E77')[seq(2, 4)]
p_country_cat_diff <- ggplot(
df_freq_avg_cat, aes(x = n_per_week_avg, y = category, fill = country)
) +
geom_bar(stat = 'identity', position = position_dodge(width = pwidth), width = 0.75) +
geom_point(
aes(x = n_per_week_avg, y = category), position = position_dodge(width = pwidth),
size = 2, show.legend = FALSE, color = 'black'
) +
geom_errorbar(
aes(xmin = n_per_week_avg - 1.96 * se_per_week_avg, xmax = n_per_week_avg + 1.96 * se_per_week_avg),
position = position_dodge(width = pwidth),
width = 0.40, linewidth = 1, show.legend = FALSE, color = 'black'
) +
geom_point(
aes(x = n_per_week_avg, y = category, col = country), position = position_dodge(width = pwidth),
size = 1, show.legend = FALSE
) +
geom_errorbar(
aes(
xmin = n_per_week_avg - 1.96 * se_per_week_avg,
xmax = n_per_week_avg + 1.96 * se_per_week_avg,
col = country
),
position = position_dodge(width = pwidth),
width = 0.30, linewidth = 0.30,
show.legend = FALSE
) +
ylab('') +
xlab('Mean number of actions per week') +
scale_color_manual(values = cols) +
scale_fill_manual(
values = cols, labels = rev(c('Italy', 'USA', 'Mexico')),
guide = guide_legend(reverse = FALSE)
) +
theme_minimal() +
theme(
legend.title = element_blank(),
legend.position = 'top',
strip.text = element_text(size = 12),
plot.title = element_text(hjust = 0.50, size = 16),
axis.text.y = element_text(margin = margin(r = -5), size = 10),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(-2, 'lines')
) + guides(fill = guide_legend(reverse = TRUE), color = FALSE)
ggsave('figures/FigureS3.pdf', p_country_cat_diff, width = 8, height = 10)
p_country_cat_diff
Figure 4: Carbon footprint and number of actions
We read in the carbon footprint data.
df_footprint <- read.csv('footprint.csv') %>%
mutate(footprint_date = as.Date(footprint_date)) %>%
rename(sub = user_id)
df_footprint_qs <- read.csv('footprint_survey_answers.csv') %>%
rename(sub = user_id)
We merge the footprint data with the clean action data.
df_act_sum <- df_act_clean %>%
left_join(
df_footprint %>%
select(sub, footprint_date) %>%
distinct(sub, footprint_date),
by = 'sub'
) %>%
group_by(sub) %>%
mutate(
first_action_date = min(logged_habit),
last_action_date = max(logged_habit),
days_on_app = length(unique(logged_habit)),
before_footprint = logged_habit <= footprint_date
) %>%
group_by(sub, category) %>%
summarize(
n_act = n(),
n_act_before_footprint = sum(before_footprint, na.rm = TRUE),
joined_app_date = as.Date(logged_habit[1]),
first_action_date = as.Date(first_action_date[1]),
last_action_date = as.Date(last_action_date[1]),
days_on_app = days_on_app[1],
time_span = as.numeric(last_action_date - first_action_date),
co2saved = sum(co2)
)
# Collapse across categories
df_total_act <- df_act_sum %>%
group_by(sub) %>%
summarize(n_act = sum(n_act))
# Actions and footprints per category
df_info_spec <- df_act_sum %>%
left_join(
df_footprint %>%
pivot_wider(names_from = category, values_from = emission) %>%
mutate(footprint_date = as.Date(footprint_date)), by = 'sub'
) %>% ungroup() %>%
mutate(n_act_log = log(n_act, base = 2))
# Overall actions and overall footprint
df_info <- df_info_spec %>%
group_by(sub) %>%
reframe(
n_act = sum(n_act),
emissions = Transport + Home + Diet + Lifestyle,
footprint_date = footprint_date
) %>% distinct() %>% na.omit() %>%
mutate(emissions_log = log(emissions, base = 2))
df_spec <- na.omit(df_info_spec) %>%
mutate(
days_diff_joined_action = as.numeric(joined_app_date - first_action_date),
days_diff_action_footprint = as.numeric(footprint_date - first_action_date)
)
# Filled out carbon footprint in the study period
nrow(df_info)
## [1] 3794
We prepare the data and create a figure without regression lines.
psize <- 1.5
tsize <- 18
col_all <- 'gray56'
blue <- brewer.pal(9, 'Blues')[5]
green <- brewer.pal(9, 'Greens')[5]
purple <- brewer.pal(9, 'Purples')[5]
df_food <- filter(df_info_spec, category == 'Food')
df_lifestyle <- filter(df_info_spec, category == 'Purchase')
df_mob <- filter(df_info_spec, category == 'Mobility')
df_home <- filter(df_info_spec, category == 'Energy')
df_mob <- df_mob %>%
group_by(sub) %>%
summarize(n_act = sum(n_act)) %>%
left_join(df_mob %>% select(-n_act) %>% distinct(sub, .keep_all = TRUE), by = 'sub') %>%
mutate(category = 'Mobility') %>%
filter(!is.na(Transport), Transport > 0, !is.na(footprint_date)) %>%
mutate(
days_on_app = days_on_app - mean(days_on_app),
Transport_log = log(Transport, base = 2)
)
df_food <- df_food %>%
filter(!is.na(Diet), Diet > 0, !is.na(footprint_date)) %>%
mutate(
days_on_app = days_on_app - mean(days_on_app),
Diet_log = log(Diet, base = 2)
)
df_lifestyle <- df_lifestyle %>%
filter(!is.na(Lifestyle), Lifestyle > 0, !is.na(footprint_date)) %>%
mutate(
days_on_app = days_on_app - mean(days_on_app),
Lifestyle_log = log(Lifestyle, base = 2)
)
df_home <- df_home %>%
filter(!is.na(Home), Lifestyle > 0, !is.na(footprint_date)) %>%
mutate(
days_on_app = days_on_app - mean(days_on_app),
Home_log = log(Home, base = 2)
)
# Run regression adjusting for the number of days spent on the app
fit_mob <- regressionBF(Transport ~ n_act + days_on_app, data = df_mob)
fit_food <- regressionBF(Diet ~ n_act + days_on_app, data = df_food)
fit_lifestyle <- regressionBF(Lifestyle ~ n_act + days_on_app, data = df_lifestyle)
fit_home <- regressionBF(Home ~ n_act + days_on_app, data = df_home)
p_all <- ggplot(df_info, aes(x = n_act, y = emissions)) +
geom_point(color = col_all, size = psize, alpha = 0.60) +
scale_y_continuous(trans = 'log2', breaks = c(500, 2^seq(0, 6) * 1000)) +
scale_x_continuous(trans = 'log2', breaks = 2^seq(0, 14, 2)) +
labs(
y = 'Carbon footprint (kgCO2-eq)',
x = '',
title = 'Overall'
) +
theme_minimal() +
theme(plot.title = element_text(size = tsize, hjust = 0.50), legend.position = 'none')
p_mob <- ggplot(df_mob, aes(x = n_act, y = Transport)) +
geom_point(color = blue, size = psize, alpha = 0.60) +
scale_y_continuous(trans = 'log2', breaks = c(1, 10, 100, 2^seq(0, 6) * 1000)) +
scale_x_continuous(trans = 'log2', breaks = 2^seq(0, 13, 2)) +
labs(
title = 'Mobility',
x = '',
y = ''
) +
theme_minimal() +
theme(plot.title = element_text(size = tsize, hjust = 0.50), legend.position = 'none')
p_food <- ggplot(df_food, aes(x = n_act, y = Diet)) +
geom_point(color = green, size = psize, alpha = 0.60) +
scale_y_continuous(trans = 'log2', breaks = c(250, 500, 1000, 2000, 4000)) +
scale_x_continuous(trans = 'log2', breaks = 2^seq(0, 13, 2)) +
labs(
title = 'Food',
y = '',
x = 'Number of reported actions',
) +
theme_minimal() +
theme(plot.title = element_text(size = tsize, hjust = 0.50), legend.position = 'none')
p_lifestyle <- ggplot(df_lifestyle, aes(x = n_act, y = Lifestyle)) +
geom_point(color = purple, size = psize, alpha = 0.60) +
scale_y_continuous(trans = 'log2', breaks = c(1, 10, 100, 2^seq(0, 6) * 1000)) +
scale_x_continuous(trans = 'log2', breaks = 2^seq(0, 13, 2)) +
labs(
title = 'Purchase',
x = 'Number of reported actions',
y = 'Carbon footprint (kgCO2-eq)'
) +
theme_minimal() +
theme(plot.title = element_text(size = tsize, hjust = 0.50), legend.position = 'none')
p1 <- ggMarginal(p_all, type = 'histogram', fill = col_all)
p2 <- ggMarginal(p_mob, type = 'histogram', fill = blue)
p3 <- ggMarginal(p_lifestyle, type = 'histogram', fill = purple)
p4 <- ggMarginal(p_food, type = 'histogram', fill = green)
# figure <- grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
# ggsave('figures/Figure3_footprint.pdf', figure, width = 10, height = 10)
We run regressions adjusting for time spent on app and create a figure with regression lines.
df_emissions <- df_info %>%
left_join(df_info_spec %>% select(sub, days_on_app, n_act_log), by = 'sub') %>%
distinct(sub, .keep_all = TRUE)
# Sample sizes
nrow(df_emissions)
## [1] 3794
nrow(df_mob)
## [1] 2891
nrow(df_food)
## [1] 3288
nrow(df_lifestyle)
## [1] 2062
fit_all_log <- regressionBF(emissions_log ~ n_act_log + days_on_app, data = df_emissions)
fit_mob_log <- regressionBF(Transport_log ~ n_act_log + days_on_app, data = df_mob)
fit_food_log <- regressionBF(Diet_log ~ n_act_log + days_on_app, data = df_food)
fit_lifestyle_log <- regressionBF(Lifestyle_log ~ n_act_log + days_on_app, data = df_lifestyle)
fit_all_log[2] / fit_all_log[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 15.97999 ±0.01%
##
## Against denominator:
## emissions_log ~ n_act_log + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_mob_log[2] / fit_mob_log[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 7.089087 ±0%
##
## Against denominator:
## Transport_log ~ n_act_log + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_food_log[3] / fit_food_log[2]
## Bayes factor analysis
## --------------
## [1] n_act_log + days_on_app : 225527150447 ±0.01%
##
## Against denominator:
## Diet_log ~ days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_lifestyle_log[2] / fit_lifestyle_log[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 13.02507 ±0%
##
## Against denominator:
## Lifestyle_log ~ n_act_log + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
# Get effect for food
post <- posterior(fit_food_log[3], iterations = 4000)
apply(post, 2, mean)
## mu n_act_log days_on_app sig2 g
## 10.3221142379 -0.0349572802 0.0004506461 0.2276695481 0.1417344613
apply(post, 2, quantile, 0.025)
## mu n_act_log days_on_app sig2 g
## 1.030614e+01 -4.420376e-02 3.250563e-05 2.168796e-01 1.629026e-02
apply(post, 2, quantile, 0.975)
## mu n_act_log days_on_app sig2 g
## 10.3381887471 -0.0258063673 0.0008558891 0.2387698186 0.6958187926
get_post_pred <- function(fit, df, iterations = 1000, varname = 'n_act_log', type = 'other') {
post <- posterior(fit, iterations = iterations)
if (type == 'other') {
values <- seq(0, max(df[, varname]), 1)
} else {
values <- seq(1, 14, 1)
}
preds <- lapply(seq(iterations), function(iter) {
post[iter, 'mu'] + values * post[iter, varname]
})
# iterations x n_act
preds <- do.call('rbind', preds)
preds_mean <- apply(preds, 2, mean)
preds_lo <- apply(preds, 2, quantile, 0.025)
preds_hi <- apply(preds, 2, quantile, 0.975)
df <- data.frame('preds_mean' = preds_mean, 'ci_lo' = preds_lo, 'ci_hi' = preds_hi)
df[['n_act']] <- values
2^df
}
preds_all <- get_post_pred(fit_all_log[3], df_emissions, type = 'all')
preds_food <- get_post_pred(fit_food_log[3], df_food)
preds_mob <- get_post_pred(fit_mob_log[3], df_mob)
preds_lifestyle <- get_post_pred(fit_lifestyle_log[3], df_lifestyle)
p_all_reg <- p_all +
geom_ribbon(data = preds_all, aes(x = n_act, y = preds_mean, ymin = ci_lo, ymax = ci_hi), alpha = 0.30) +
geom_line(data = preds_all, aes(x = n_act, y = preds_mean), size = 0.75)
p_food_reg <- p_food +
geom_ribbon(data = preds_food, aes(x = n_act, y = preds_mean, ymin = ci_lo, ymax = ci_hi), alpha = 0.30) +
geom_line(data = preds_food, aes(x = n_act, y = preds_mean), size = 0.75)
p_mob_reg <- p_mob +
geom_ribbon(data = preds_mob, aes(x = n_act, y = preds_mean, ymin = ci_lo, ymax = ci_hi), alpha = 0.30) +
geom_line(data = preds_mob, aes(x = n_act, y = preds_mean), size = 0.75)
p_lifestyle_reg <- p_lifestyle +
geom_ribbon(data = preds_lifestyle, aes(x = n_act, y = preds_mean, ymin = ci_lo, ymax = ci_hi), alpha = 0.30) +
geom_line(data = preds_lifestyle, aes(x = n_act, y = preds_mean), size = 0.75)
p1_reg <- ggMarginal(p_all_reg, type = 'histogram', fill = col_all)
p2_reg <- ggMarginal(p_mob_reg, type = 'histogram', fill = blue)
p3_reg <- ggMarginal(p_lifestyle_reg, type = 'histogram', fill = purple)
p4_reg <- ggMarginal(p_food_reg, type = 'histogram', fill = green)
figure_reg <- grid.arrange(p1_reg, p2_reg, p3_reg, p4_reg, nrow = 2, ncol = 2)
ggsave('figures/Figure4.pdf', figure_reg, width = 10, height = 10)
grid.arrange(p1_reg, p2_reg, p3_reg, p4_reg, nrow = 2, ncol = 2)
Figure 5: Psychological variables and number of actions
We read the survey results in and prepare it for analysis.
df <- rbind(
read_excel('survey.xlsx', sheet = 1),
read_excel('survey.xlsx', sheet = 2)
)
cnames_eng <- c(
'language', 'email', 'gender', 'age', 'residence',
'environmentalist_a', 'environmentalist_b', 'environmentalist_c', 'environmentalist_d',
'fundamental_change', 'efficacy_a', 'efficacy_b', 'efficacy_c', 'inner_circle', 'app_helped',
'why_aworld_a', 'why_aworld_b'
)
cnames_it <- paste0(cnames_eng[-1], '_it')
other <- c(
'sha', 'politics_it', 'education_status_it', 'status_it', 'education_status',
'politics', 'status', 'education', 'education_it', 'submitted_at', 'token'
)
cnames <- c(cnames_eng, cnames_it, other)
colnames(df) <- cnames
df_fin <- df %>%
mutate(
gender = coalesce(gender, gender_it),
age = coalesce(age, age_it),
residence = coalesce(residence, residence_it),
environmentalist_a = coalesce(environmentalist_a, environmentalist_a_it),
environmentalist_b = coalesce(environmentalist_b, environmentalist_b_it),
environmentalist_c = coalesce(environmentalist_c, environmentalist_c_it),
environmentalist_d = coalesce(environmentalist_d, environmentalist_d_it),
environmentalist = (environmentalist_a + environmentalist_b + environmentalist_c + environmentalist_d) / 4,
fundamental_change = coalesce(fundamental_change, fundamental_change_it),
efficacy_a = coalesce(efficacy_a, efficacy_a_it),
efficacy_b = coalesce(efficacy_b, efficacy_b_it),
efficacy_c = coalesce(efficacy_c, efficacy_c_it),
efficacy = (efficacy_a + efficacy_b + efficacy_c) / 3,
inner_circle = coalesce(inner_circle, inner_circle_it),
app_helped = coalesce(app_helped, app_helped_it),
why_aworld_a = coalesce(why_aworld_a, why_aworld_a_it),
why_aworld_b = coalesce(why_aworld_b, why_aworld_b_it)
) %>%
select(-matches('_it'), -email) %>%
mutate(
residence = case_when(
residence == 'Periferia, vicino a una grande città ' ~ 'Suburb near a large city',
residence == 'Piccola città o paese' ~ 'Small city or town',
residence == 'Area rurale' ~ 'Rural area',
residence == 'Grande città ' ~ 'Large city',
TRUE ~ residence
),
app_helped = ifelse(app_helped == 'Si', 'Yes', app_helped)
) %>%
filter(!is.na(sha))
## Joining with action data
df_matching <- read.csv('aworld_bigquery.csv')
df_fin <- df_fin %>%
group_by(sha) %>%
mutate(n = n()) %>%
filter(n == 1) %>%
left_join(df_matching, by = 'sha')
df_match <- df_fin %>%
select(sub, language, gender, age, residence, environmentalist, efficacy, status, politics, inner_circle) %>%
inner_join(
df_act_sum %>%
select(sub, category, n_act),
by = 'sub'
) %>%
ungroup() %>%
filter(category %in% c('Food', 'Mobility', 'Purchase')) %>%
rename(n = n_act) %>%
select(-sha)
# Only 334 have logged behavior
n_logged <- nrow(df_fin %>% filter(sub %in% unique(activity_table_ext$sub)))
print(n_logged)
## [1] 334
nrow(df_fin) - n_logged
## [1] 755
# And then from those 334 we only retain 132, because the rest didn't make the data cleaning
n_cleaned <- nrow(df_match %>% distinct(sub))
n_cleaned
## [1] 132
n_logged - n_cleaned
## [1] 202
df_psych <- df_match %>%
select(sub, environmentalist, efficacy, inner_circle, n, category) %>%
pivot_longer(cols = -c(n, sub, category)) %>%
mutate(
name = case_when(
name == 'efficacy' ~ 'Perceived effectiveness of individual action',
name == 'environmentalist' ~ 'Environmentalist identity',
name == 'inner_circle' ~ 'Climate concern in inner circle'
),
name = factor(
name,
levels = c(
'Environmentalist identity',
'Perceived effectiveness of individual action',
'Climate concern in inner circle'
)
)
) %>%
left_join(
df_act_sum %>%
select(sub, days_on_app) %>%
distinct(sub, days_on_app) %>%
ungroup() %>%
mutate(
days_on_app_raw = days_on_app,
days_on_app = days_on_app - mean(days_on_app)
)
, by = 'sub'
) %>%
mutate(
log_n = log(n, base = 2),
category = factor(category, levels = c('Mobility', 'Purchase', 'Food'))
)
Description of the sample.
x <- df_psych %>% distinct(sub, .keep_all = TRUE)
mean(x$days_on_app_raw)
## [1] 31.84848
median(x$days_on_app_raw)
## [1] 10
sd(x$days_on_app_raw)
## [1] 72.28912
df_psych %>%
distinct(sub, category, .keep_all = TRUE) %>%
group_by(category) %>%
summarize(n = sum(n))
## # A tibble: 3 × 2
## category n
## <fct> <int>
## 1 Mobility 3310
## 2 Purchase 1253
## 3 Food 9358
We create a scatterplot figure and later add the regression lines.
blue <- brewer.pal(9, 'Blues')[5]
green <- brewer.pal(9, 'Greens')[5]
purple <- brewer.pal(9, 'Purples')[5]
cols <- c(blue, purple, green)
ppsych <- ggplot(df_psych, aes(y = value, x = n, color = category)) +
geom_jitter(width = 0.1, height = 0.1, alpha = 1) +
# geom_jitter(width = 0, height = 0, alpha = 0.50) +
scale_color_manual('', values = cols) +
scale_x_continuous(
name = 'Number of actions',
labels = label_number(big.mark = ','),
trans = 'log2', breaks = c(1, 10, 100, 1000, 10000, 10000, 100000)
) +
scale_y_continuous(name = '', breaks = seq(1, 7)) +
facet_wrap(~ name) +
theme_minimal() +
theme(
legend.position = 'top',
strip.text = element_text(size = 12)
# legend.title = element_blank()
) + coord_flip()
# Number of participants of the survey
length(unique(df_psych$sub))
## [1] 132
# Total number of actions by the users who filled out the survey
sum((activity_table_ext %>% filter(sub %in% unique(df_psych$sub)))$n)
## [1] 68035
We run regressions adjusting for time spent on the app and create a figure with the regression lines.
df_identity <- filter(df_psych, !is.na(days_on_app), name == 'Environmentalist identity')
df_effectiveness <- filter(df_psych, !is.na(days_on_app), name == 'Perceived effectiveness of individual action')
df_concern <- filter(df_psych, !is.na(days_on_app), name == 'Climate concern in inner circle')
# Per category
## Environmentalist identity
fit_identity_mob <- regressionBF(log_n ~ value + days_on_app, data = df_identity %>% filter(category == 'Mobility'))
fit_identity_mob[2] / fit_identity_mob[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 4.336416 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_identity_purchase <- regressionBF(log_n ~ value + days_on_app, data = df_identity %>% filter(category == 'Purchase'))
fit_identity_purchase[2] / fit_identity_purchase[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 2.123689 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_identity_food <- regressionBF(log_n ~ value + days_on_app, data = df_identity %>% filter(category == 'Food'))
fit_identity_food[2] / fit_identity_food[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 5.951668 ±0.01%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
## Effectiveness
fit_eff_mob <- regressionBF(log_n ~ value + days_on_app, data = df_effectiveness %>% filter(category == 'Mobility'))
fit_eff_mob[2] / fit_eff_mob[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 6.132998 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_eff_purchase <- regressionBF(log_n ~ value + days_on_app, data = df_effectiveness %>% filter(category == 'Purchase'))
fit_eff_purchase[2] / fit_eff_purchase[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 3.438857 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_eff_food <- regressionBF(log_n ~ value + days_on_app, data = df_effectiveness %>% filter(category == 'Food'))
fit_eff_food[2] / fit_eff_food[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 7.683544 ±0.01%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
## Concern
fit_concern_mob <- regressionBF(log_n ~ value + days_on_app, data = df_concern %>% filter(category == 'Mobility'))
fit_concern_mob[2] / fit_concern_mob[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 5.953516 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_concern_purchase <- regressionBF(log_n ~ value + days_on_app, data = df_concern %>% filter(category == 'Purchase'))
fit_concern_purchase[2] / fit_concern_purchase[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 2.495866 ±0%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
fit_concern_food <- regressionBF(log_n ~ value + days_on_app, data = df_concern %>% filter(category == 'Food'))
fit_concern_food[2] / fit_concern_food[3]
## Bayes factor analysis
## --------------
## [1] days_on_app : 7.004589 ±0.01%
##
## Against denominator:
## log_n ~ value + days_on_app
## ---
## Bayes factor type: BFlinearModel, JZS
get_post_pred2 <- function(
fit, df, iterations = 1000,
category = 'Food', varname = 'log_n', type = 'other'
) {
if (!is.null(category)) {
df <- df[df$category == category, ]
}
post <- posterior(fit, iterations = iterations)
values <- seq(1, 7, 0.05) - mean(seq(1, 7, 0.05))
preds <- lapply(seq(iterations), function(iter) {
post[iter, 'mu'] + values * post[iter, varname]
})
# iterations x n_act
preds <- do.call('rbind', preds)
preds_mean <- apply(preds, 2, mean)
preds_lo <- apply(preds, 2, quantile, 0.025)
preds_hi <- apply(preds, 2, quantile, 0.975)
df <- data.frame(
'preds_mean' = preds_mean, 'ci_lo' = preds_lo,
'ci_hi' = preds_hi, 'category' = ifelse(is.null(category), '', category)
)
df[['values']] <- seq(1, 7, 0.05)
df
}
preds_identity <- bind_rows(
get_post_pred2(fit_identity_food[3], df_identity, category = 'Food', varname = 'value'),
get_post_pred2(fit_identity_mob[3], df_identity, category = 'Mobility', varname = 'value'),
get_post_pred2(fit_identity_purchase[3], df_identity, category = 'Purchase', varname = 'value')
)
preds_effectiveness <- bind_rows(
get_post_pred2(fit_eff_food[3], df_effectiveness, category = 'Food', varname = 'value'),
get_post_pred2(fit_eff_mob[3], df_effectiveness, category = 'Mobility', varname = 'value'),
get_post_pred2(fit_eff_purchase[3], df_effectiveness, category = 'Purchase', varname = 'value')
)
preds_concern <- bind_rows(
get_post_pred2(fit_concern_food[3], df_concern, category = 'Food', varname = 'value'),
get_post_pred2(fit_concern_mob[3], df_concern, category = 'Mobility', varname = 'value'),
get_post_pred2(fit_concern_purchase[3], df_concern, category = 'Purchase', varname = 'value')
)
preds <- bind_rows(
preds_identity %>% mutate(name = 'Environmentalist identity'),
preds_effectiveness %>% mutate(name = 'Perceived effectiveness of individual action'),
preds_concern %>% mutate(name = 'Climate concern in inner circle')
) %>%
mutate(
name = factor(
name,
levels = c(
'Environmentalist identity',
'Perceived effectiveness of individual action',
'Climate concern in inner circle'
)
),
category = factor(category, levels = c('Mobility', 'Purchase', 'Food'))
)
ppsych_reg <- ppsych + coord_flip() +
geom_ribbon(
data = preds,
aes(x = 2^preds_mean, y = values, xmin = 2^ci_lo, xmax = 2^ci_hi, fill = category),
alpha = 0.2, inherit.aes = FALSE
) +
scale_fill_manual(values = cols) +
geom_line(data = preds, aes(x = 2^preds_mean, y = values, color = category), size = 1) +
guides(fill = FALSE)
ggsave('figures/Figure5.pdf', ppsych_reg, width = 11, height = 5)
ppsych_reg